******************************************************************************************
* Do-file name:	an_data_siab_01.do 
* Task:         restricts sample and runs correlation analysis (Table A.1) 
* Last change:  17.10.2023 
* Notes:		Based on SIAB_R7519 data
/*
This file contains the code used to generate the results presented in the following table:
- Table A.1:
		-- Employment correlation
		-- Wage correlation
*/
******************************************************************************************


******************************************************************************************
*** program setup
******************************************************************************************

* Interpret this do-file under Stata 17.0 rules
version 17.0

* Start from a clean session (data, stored results, macros, cached programs)
clear all
macro drop _all
discard

* Output settings for the log
set more off
set linesize 90
* set trace on

* Fix the random-number seed
set seed 123456789


******************************************************************************************
*** load data and restrict sample
******************************************************************************************

*** load the working data set
use "data/work.dta", clear

*** restrict sample: all four conditions must hold
***   native2 == 1        only natives
***   ost     == 0        only west-germany
***   age  in [18, 65]
***   year in [1985, 2015]
keep if native2 == 1               ///
      & ost     == 0               ///
      & inrange(age,  18,   65)    ///
      & inrange(year, 1985, 2015)

*** remove variables that are not used below
* NOTE(review): the filter uses native2 while this list names native;
* presumably these are two distinct variables - confirm.
drop native pers_gr ausbildung_imp ausbildung_gr schule   ///
     beruford beruf2010_gr grund alo_dau pendler w08_gen_gr


******************************************************************************************
*** create new variables
******************************************************************************************

*** education dummies: one indicator per category of edu_2,
*** the first named "low" and the second "high"
tabulate edu_2, generate(edu_2_)
rename (edu_2_1 edu_2_2) (edu_2_low edu_2_high)

*** age categories: 3 groups (missing if age falls in none of the brackets)
generate age_3 = cond(inrange(age, 18, 29), 1,    ///
                 cond(inrange(age, 30, 49), 2,    ///
                 cond(inrange(age, 50, 65), 3, .)))
label variable age_3 "age, 3 groups: 1 = 18-29, 2 = 30-49, 3 = 50-65"

*** one indicator per age group (age_3_1, age_3_2, age_3_3)
tabulate age_3, generate(age_3_)


******************************************************************************************
*** prepare data for aggregation
******************************************************************************************

*** declare the person-year panel and fill in the years missing between
*** each person's first and last observed year (filled rows are all-missing)
xtset vsnr_ano year
tsfill

*** one-year leads of employment status and FTE weight
generate emp_lead1        = F1.emp
generate weight_fte_lead1 = F1.weight_fte
*generate niveau_high_lead1 = F1.niveau_high

*** incumbent workers: employed in t and in t+1;
*** the value stored in year t is the FTE weight of year t+1
generate emp_incum = weight_fte_lead1  if emp == 1 & emp_lead1 == 1

*** log-wage change of incumbent workers: employed in t-1 and t, log wage
*** observed in both years, and same ao_kreis in both years
*** (the full-time restriction is applied later, in the wage sample)
generate d_ln_wage_incum = D1.ln_wage            ///
    if  emp == 1 & L1.emp == 1                   ///
      & !missing(ln_wage, L1.ln_wage)            ///
      & ao_kreis == L1.ao_kreis

*** keep only observations with status == 1; this must come after the
*** leads/lags are built, and it also removes the rows added by tsfill
keep if status == 1

*** store the person-level file that both district-level samples start from
compress
save "data/work_2.dta", replace


******************************************************************************************
*** collapse data on district level: employment sample
******************************************************************************************

*** remove variables that are no longer needed and would not collapse correctly
drop berufstg edu_3 edu_2 age_3 emp_lead1 weight_fte_lead1

*** district-year aggregates. FTE weights are used, so only employed
*** individuals (status 1) enter. (mean) and (sum) use weight_fte as
*** importance weight; (rawsum) adds up the values without weighting.
collapse (mean)   ost female teilzeit edu_2_* age age_3_? tag_entg    ///
         (sum)    emp                                                 ///
         (rawsum) emp_incum weight_fte                                ///
         [iweight=weight_fte],                                        ///
         by(ao_kreis year)

*** declare the district-year panel
sort  ao_kreis year
xtset ao_kreis year

*** lagged level and first difference of district employment
generate l_emp = L1.emp
generate d_emp = D1.emp

*** emp_incum is recorded in the base year t; lagging it assigns the sum of
*** incumbent workers to year t+1, the year the growth rate refers to
generate incum_emp = L1.emp_incum
drop emp_incum

*** growth rates relative to last year's employment
generate g_emp       = d_emp     / l_emp
generate g_incum_emp = incum_emp / l_emp

*** district mean of the FTE weight sum (regression weight; not referenced
*** again in the visible part of this do-file)
bysort ao_kreis: egen mean_weight_fte = mean(weight_fte)

*** take out year fixed effects from the incumbent and the regional growth
*** rate (weighted by lagged employment) and keep the residuals
foreach v of varlist  g_incum_emp  g_emp  {
	regress `v'  i.year  [pweight=l_emp]
	predict `v'_resid, residuals
}

*** standardize raw and residualized variables; 1992 and 1999 are left out
local keep_years "!inlist(year, 1992, 1999)"
foreach v of varlist  g_emp  g_incum_emp  g_emp_resid  g_incum_emp_resid  {
	center `v'  [pweight=l_emp]  if `keep_years', standardize generate(`v'_std)
}

** check results: summary statistics of the standardized variables
summarize g_emp_std g_incum_emp_std g_emp_resid_std g_incum_emp_resid_std   ///
	[aweight=l_emp]  if `keep_years'


******************************************************************************************
*** results employment sample
******************************************************************************************

*** correlation between standardized residual incumbent and regional
*** employment growth, weighted by lagged employment, without 1992 and 1999
local keep_years "!inlist(year, 1992, 1999)"
regress   g_incum_emp_resid_std   g_emp_resid_std  [pweight=l_emp]  if `keep_years'
correlate g_incum_emp_resid_std   g_emp_resid_std  [aweight=l_emp]  if `keep_years'


******************************************************************************************
*** collapse data on district level: wage sample
******************************************************************************************

*** reload the person-level file saved before the employment collapse
use "data/work_2.dta", clear

*** restrict to full-time employed workers
drop if teilzeit != 0

*** remove variables that are no longer needed and would not collapse correctly
drop berufstg edu_3 edu_2 age_3 emp_lead1 weight_fte_lead1

*** district-year aggregates (only full-time employed individuals):
*** unweighted means and unweighted sums
collapse (mean) ost female teilzeit edu_2_* age age_3_? tag_entg    ///
                ln_wage d_ln_wage_incum                             ///
         (sum)  emp emp_incum weight_fte,                           ///
         by(ao_kreis year)

*** declare the district-year panel
sort  ao_kreis year
xtset ao_kreis year

*** lagged district employment (weight in the regressions below)
generate l_emp = L1.emp

*** lag and first difference of the district mean log wage
generate l_ln_wage = L1.ln_wage
generate d_ln_wage = D1.ln_wage

*** district mean of the incumbents' individual log-wage changes,
*** renamed to match the d_<group>_ln_wage naming used below
rename d_ln_wage_incum d_incum_ln_wage

*** district mean of the FTE weight sum (regression weight; not referenced
*** again in the visible part of this do-file)
bysort ao_kreis: egen mean_weight_fte = mean(weight_fte)

*** take out year fixed effects from the incumbent and the regional wage
*** change (weighted by lagged employment) and keep the residuals
foreach v of varlist  d_incum_ln_wage  d_ln_wage  {
	regress `v'  i.year  [pweight=l_emp]
	predict `v'_resid, residuals
}

*** standardize raw and residualized variables; 1992 and 1999 are left out
local keep_years "!inlist(year, 1992, 1999)"
foreach v of varlist  d_ln_wage  d_incum_ln_wage  d_ln_wage_resid  d_incum_ln_wage_resid  {
	center `v'  [pweight=l_emp]  if `keep_years', standardize generate(`v'_std)
}

** check results: summary statistics of the standardized variables
summarize d_ln_wage_std d_incum_ln_wage_std d_ln_wage_resid_std d_incum_ln_wage_resid_std   ///
	[aweight=l_emp]  if `keep_years'


******************************************************************************************
*** results wage sample
******************************************************************************************

*** correlation between standardized residual incumbent and regional
*** wage changes, weighted by lagged employment, without 1992 and 1999
local keep_years "!inlist(year, 1992, 1999)"
regress   d_incum_ln_wage_resid_std  d_ln_wage_resid_std  [pweight=l_emp]  if `keep_years'
correlate d_incum_ln_wage_resid_std  d_ln_wage_resid_std  [aweight=l_emp]  if `keep_years'


******************************************************************************************
*** end
******************************************************************************************

*** remove the working files
* A file that is already absent is skipped instead of aborting the do-file
* with r(601); any other failure of erase on an existing file still stops
* the run.
* NOTE: data/work.dta is the input loaded at the top of this do-file, so the
* do-file cannot be re-run after this step unless that file is regenerated.
foreach tmpfile in "data/work.dta" "data/work_2.dta" "data/clean_work.dta" {
	capture confirm file "`tmpfile'"
	if _rc == 0 {
		erase "`tmpfile'"
	}
}


exit

*========================================================================================*
Comments:
- unique identifier: vsnr_ano year
- wages are not imputed
